This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

##all viruses - target and off target

library(tidyverse)
── Attaching core tidyverse packages ──────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggpubr)
library(tidyplots)

Attaching package: ‘tidyplots’

The following object is masked from ‘package:ggpubr’:

    gene_expression
library(ggthemes)
library(scales)

Attaching package: ‘scales’

The following object is masked from ‘package:purrr’:

    discard

The following object is masked from ‘package:readr’:

    col_factor

Figure 1 - linear model

Read-in data

#Linear model for TE data - separated by virus
#November 2024

#viral read depth
#use bowtie2 data

library(tidyverse)
library(broom)
library(boot)

####read counts/normalised read counts####

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

Format data


depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  ,
  .default = "RNA"
  ))

####linear model for spike in viruses - mean read depth####

depths_reads_sub <- depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  mutate(log_depth = log10(mean_depth)) |>
  mutate(log_viral_load = log10(Viral.load))

#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)

summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)

model

#all viruses together

lm1 <-
  glm(log_depth ~ log_viral_load * virus,
      data = depths_reads_sub,
      family = "gaussian")

summary(lm1)

glm.diag.plots(lm1)

pred <- predict(lm1, type = "response")

rsq <- function (x, y) {
  cor(x, y) ^ 2
}

rsq(depths_reads_sub$log_depth, pred)

##split by viruses

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")

facet_names <- c(
  "Human_adenovirus_40" = "Human adenovirus 40",
  "Human_betaherpesvirus" = "Human betaherpesvirus",
  "Human_respiratory_syncytial_virus" = "Human respiratory syncytial virus",
  "Influenza_B_virus" = "Influenza B virus",
  "Mammalian_orthoreovirus3" = "Mammalian orthoreovirus 3",
  "Zika_virus" = "Zika virus"
)

virus <- (unique(depths_reads_sub$virus)) %>%
  rep(., each = 2) %>%
  data.frame()

list_models <- 
  depths_reads_sub |>
  group_split(virus) |>
  map( ~ lm(log_depth ~ log_viral_load, data = .))

lm_tidy <- 
  map(list_models, broom::tidy) %>%
  do.call(rbind.data.frame, .) %>%
  cbind(virus, .) %>%
  rename(virus = ".") %>%
  select(virus, term, estimate) %>%
  pivot_wider(names_from = "term", values_from = "estimate")

lm_summary <- depths_reads_sub |>
  group_by(virus) |>
  summarise(
    Intercept = lm(log_depth ~ log_viral_load)$coefficients[1],
    Coeff_x1 = lm(log_depth ~ log_viral_load)$coefficients[2],
    R2 = summary(lm(log_depth ~ log_viral_load))$r.squared,
    pvalue = summary(lm(log_depth ~ log_viral_load))$coefficients["log_viral_load", 4]
  )

lm_combined <- 
  left_join(lm_tidy, lm_summary, by = "virus")

Make plot

Final plot

#ggsave("figures/compare_spike_ins_atcc/model_readdepth_dedup.png")

ggsave("figures/manuscript_figure_2025/PDF/Figure_1.pdf",
       width = 8,
       height = 5)

ggsave("figures/manuscript_figure_2025/PNG/Figure_1.png",
       width = 8,
       height = 5)

——————–

>>>> now redundant? - Figure 1: plots of TE experiment data - read counts and read depths

Read-in data

#ATCC genomes updated version
#October 2024

#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data

####read counts/normalised read counts####

#metadata

metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

Format data

#change labels in facet plots

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")

Make plots


#plot viral read counts (non normalised)

read_count <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("log10(Viral Reads)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#ggsave("figures/compare_spike_ins_atcc/read_counts.pdf",width=8,height=6)

#normalise by raw read count

read_count_norm1 <- counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("log10(viral reads/cleaned reads)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#normalise by raw read count and genome length

read_count_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = (Viral.load),
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log (viral reads/cleaned reads/genome length)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

plot just the two lefthand panels from the original figure

plot viral read counts (non normalised)


##deduplicated only

read_count_dedup <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE, method = "lm") +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log (Viral Reads)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#normalise by raw read count and genome length

read_count_norm2_dedup <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE, method = "lm") +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log (viral reads/cleaned reads/genome length)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

Final plot

——————–

Figure S1

Read depths

####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

depth_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  depths_bt_all |>
  left_join(metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  )) |>
  rename(Viral.load = `Viral load`)

Make plots


read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log (mean read depth)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#ggsave("figures/compare_spike_ins_atcc/mean_depth.pdf",width=8,height=6)

Final plot

——————–

Figure S2

Read-in data / Make plots

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

####compare library capture pool####

cols3 <- c("#228833", "#AA3377")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

metadata3 <- 
  metadata |>
  select(Sample.ID, Pool.for.sequencing) |>
  rename(Sample_id = Sample.ID) |>
  rename(Pool = Pool.for.sequencing)

counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")

Make plots


pool_counts_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (Viral Reads)") + xlab("Log (Viral load)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
  scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
  

# pool_readcounts_norm <- 
#   counts_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(norm_counts1))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     # axis.text.x = element_blank(),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("log10(viral reads/cleaned reads)")

pool_norm1_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray") 
  ) +
  ylab("log10(viral reads/cleaned reads)") + xlab("Log (Viral load)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
    scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))

# pool_readcounts_norm2 <- 
#   counts_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(norm_counts2))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     # axis.text.x = element_blank(),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("log10(viral reads/cleaned reads/genome length)")


pool_norm2_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("log10(viral reads/cleaned reads/genome length)") + xlab("Log (Viral load)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
    scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))


#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")

# pool_readdepths <- 
#   depths_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(mean_depth))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     axis.text.x = element_text(angle = 45, hjust = 1),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("Log (mean read depth)")

pool_depths_sum <-
  depths_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(mean_depth),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (mean read depth)") +
  xlab("Log (Viral load)") + 
  scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5}))) +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

Final plot

ggarrange(
  pool_counts_sum,
  pool_norm1_sum,
  pool_norm2_sum,
  pool_depths_sum,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE, 
  align = "hv"
)
Error: object 'pool_counts_sum' not found

——————–

Figure S3


##plots of read counts and viral read counts
#November 2024

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import read count files

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

cols2 <- c("#BB5566", "#004488")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

reads_metadata_dedup <-
  left_join(counts_dedup_bt, metadata2, by = "Sample_id")

reads_viral_dedup <-
  left_join(reads_metadata_dedup, viral_reads_dedup, by = "Sample_id")

reads_metadata_nodedup <-
  left_join(counts_nodedup_bt, metadata2, by = "Sample_id")

reads_viral_nodedup <-
  left_join(reads_metadata_nodedup, viral_reads_nodedup, by = "Sample_id")

reads_viral_all <- rbind(reads_viral_dedup, reads_viral_nodedup)


reads_plot <- 
  reads_viral_all |>
  group_by(Background, Sample_id, Viral.load, type) |>
  summarise(
    total_reads = (QC_reads * 2),
    viral_reads = total_virus_reads,
    ATCC_reads = sum(matched),
    prop_ATCC = ATCC_reads / total_reads,
    prop_viral = total_virus_reads / total_reads,
    diff = viral_reads - ATCC_reads
  ) |>
  unique()
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
Please use `reframe()` instead.
When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.`summarise()` has grouped output by 'Background', 'Sample_id', 'Viral.load', 'type'. You can override using the `.groups` argument.

Make plots

Final plot

——————–

No longer required? Figure S4

Make plots

####read counts/depths split by background####

#change labels in facet plots

facet_names_bg <- c(
  "dedup_TE" = "Deduplicated",
  "nodedup_TE" = "Non-Deduplicated",
  "p2" = "P1",
  "p8" = "P2"
)

#break down by Background x virus

background_counts_reads <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels =
      c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
      )
  ) +
  scale_x_log10(labels = scales::label_log10()) +
  ylab("log10(Viral Reads)")

backgrounds_counts_reads_norm <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_bw() +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log10()) +
  ylab("log10(viral reads/cleaned reads)")

backgrounds_counts_reads_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(labels = scales::label_log10()) +
  ylab("log10(viral reads/cleaned reads/genome length)")

background_read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
    axis.title.y = element_text(size = 10),
    legend.title = element_blank()
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log10()) +
  ylab("log10(mean read depth)")

Final plot

——————–

——————–

>>> Figure 2: plots of TE experiment data - genome coverage & per site coverage

Read-in data


#plots of TE experiment data - genome coverage & per site coverage
#also separated by viruses
#ATCC genomes updated version
#October 2024

#use bowtie2 data

####genome coverage####

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_TE/")

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv.zip", exdir = "data_TE/")

dedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )


nodedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

file.remove("data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")

file.remove( "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv")

#add length column from read depth file to per site file

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- 
  metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files

#deduplicated and non-deduplicated

counts_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

counts_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

Format data

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- 
  counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

lengths <- 
  counts_reads_norm |>
  select(virus, length) |>
  distinct()

#calculate genome coverage

persite_coverage_dedup <- 
  dedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_nodedup <- 
  nodedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_both <-
  rbind(persite_coverage_dedup, 
        persite_coverage_nodedup)

coverage_labels <- c("0" = "Control",
                     "100" = "10^{2}",
                     "1000" = "10^{3}",
                     "10000" = "10^{5}",
                     "dedup_TE" = "Deduplicated",
                     "nodedup_TE"= "Non-Deduplicated")

coverage_labels2 <- c("0" = "Control",
                      "100" = "10^{2}",
                      "1000" = "10^{3}",
                      "10000" = "10^{5}")

Make plots

Final plot

——————–

——————–

Figure S4

Final plot

ggsave("figures/manuscript_figure_2025/PDF/Figure_S4.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S4.png",
       width=15,
       height=10)

——————–

Figure S5

####Final plot

file.remove("figures/manuscript_figure_2025/PDF/FigureS6.pdf")
[1] TRUE

——————–

Figure S6

####Final plot

file.remove("figures/manuscript_figure_2025/PNG/Figure_S6.pdf")
[1] TRUE

——————–

Figure S7

HBV <- 
persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_betaherpesvirus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Human Betaherpesvirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

####Final plot

file.remove("figures/manuscript_figure_2025/PNG/Figure_S7.pdf")
[1] TRUE

——————–

Figure S8

#have to do these differently due to segmentation


FLU_ME1 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Influenza B virus (Background 1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))

FLU_ME2 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Influenza B virus (Background 2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))


ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)

####Final plot


ggsave("figures/manuscript_figure_2025/PDF/Figure_S8.pdf",
       width=20,
       height=12,
       dpi=600)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S8.png",
       width=20,
       height=12)

——————–

Figure S9


REO_ME1 <- 
  persite_norm |>
  filter(virus == "Mammalian_orthoreovirus3") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("log10(Coverage)") +
  ggtitle("Mammalian orthoreovirus 3 (Background 1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4,
                    label = c(
                      expression("10" ^ "2"),
                      expression("10" ^ "3"),
                      expression("10" ^ "5")
                    ))

REO_ME2 <- 
  persite_norm |>
  filter(virus == "Mammalian_orthoreovirus3") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p8") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("log10(Coverage)") +
  ggtitle("Mammalian orthoreovirus 3 (Background 2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) + scale_y_log10() +
  scale_fill_manual(values = cols4,
                    label = c(
                      expression("10" ^ "2"),
                      expression("10" ^ "3"),
                      expression("10" ^ "5")
                    ))

ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)

####Final plot

ggsave("figures/manuscript_figure_2025/PDF/Figure_S9.pdf",
       width=20,
       height=12,
       dpi=600)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S9.png",
       width=20,
       height=12,
       dpi=600)

——————–

>>>> Figure 3: all viruses - target and off target

Read-in data

metadata2 <-
  metadata |>
  select(
    Sample.ID,
    Background.sample,
    Viral.load,
    Number.of.read.pairs..quality.adaptor.trimmed.
  ) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
Error in rename(rename(select(metadata, Sample.ID, Background.sample,  : 
  could not find function "rename"

Format data

Make plots

Dedup – viral reads, prop all reads, and prop viral reads

Final plot

—-nodedup results —-

——————–

---
title: "target_offtarget"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
##all viruses - target and off target

library(tidyverse)
library(ggpubr)
library(tidyplots)
library(ggthemes)
library(scales)

```

### Figure 1 - linear model

#### Read-in data
```{r}
#Linear model for TE data - separated by virus
#November 2024

#viral read depth
#use bowtie2 data

library(tidyverse)
library(broom)
library(boot)

####read counts/normalised read counts####

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

```

#### Format data
```{r}

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  ,
  .default = "RNA"
  ))

####linear model for spike in viruses - mean read depth####

depths_reads_sub <- depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  mutate(log_depth = log10(mean_depth)) |>
  mutate(log_viral_load = log10(Viral.load))

#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)

summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)

```

#### model
```{r}
#all viruses together

lm1 <-
  glm(log_depth ~ log_viral_load * virus,
      data = depths_reads_sub,
      family = "gaussian")

summary(lm1)

glm.diag.plots(lm1)

pred <- predict(lm1, type = "response")

rsq <- function (x, y) {
  cor(x, y) ^ 2
}

rsq(depths_reads_sub$log_depth, pred)

##split by viruses

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")

facet_names <- c(
  "Human_adenovirus_40" = "Human adenovirus 40",
  "Human_betaherpesvirus" = "Human betaherpesvirus",
  "Human_respiratory_syncytial_virus" = "Human respiratory syncytial virus",
  "Influenza_B_virus" = "Influenza B virus",
  "Mammalian_orthoreovirus3" = "Mammalian orthoreovirus 3",
  "Zika_virus" = "Zika virus"
)

virus <- (unique(depths_reads_sub$virus)) %>%
  rep(., each = 2) %>%
  data.frame()

list_models <- 
  depths_reads_sub |>
  group_split(virus) |>
  map( ~ lm(log_depth ~ log_viral_load, data = .))

lm_tidy <- 
  map(list_models, broom::tidy) %>%
  do.call(rbind.data.frame, .) %>%
  cbind(virus, .) %>%
  rename(virus = ".") %>%
  select(virus, term, estimate) %>%
  pivot_wider(names_from = "term", values_from = "estimate")

lm_summary <- depths_reads_sub |>
  group_by(virus) |>
  summarise(
    Intercept = lm(log_depth ~ log_viral_load)$coefficients[1],
    Coeff_x1 = lm(log_depth ~ log_viral_load)$coefficients[2],
    R2 = summary(lm(log_depth ~ log_viral_load))$r.squared,
    pvalue = summary(lm(log_depth ~ log_viral_load))$coefficients["log_viral_load", 4]
  )

lm_combined <- 
  left_join(lm_tidy, lm_summary, by = "virus")


```

#### Make plot
```{r, message=FALSE, warning=FALSE, echo=FALSE}
ggplot() +
  geom_point(data = depths_reads_sub,
             mapping = aes(Viral.load, mean_depth, color = virus)) +
  geom_abline(data = lm_combined, aes(intercept = `(Intercept)`, slope = log_viral_load), linetype="dotted") +
  geom_text(data = lm_combined,
            colour = "black",
            aes(
              x = 6*10000,
              y = 2,
              label = round(Coeff_x1, digits = 2)),
            size = 3.5) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(x = 1.5*10000, 
                y = 2, 
                label = "Slope ="),
            size = 3.5) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(
              x = 6*10000,
              y = 6,
              label = round(R2, digits = 2)),
            size = 3.5) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(x = 2*10000, 
                y = 6, 
                label = "R2 ="),
            size = 3.5) +
  facet_wrap( ~ virus, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
    axis.title.y = element_text(size = 10),
    legend.title = element_blank()
  ) +
    xlab("Log (Viral load)") + 
  ylab("Log (Mean Read Depth)") +
  scale_color_manual(values = cols) +
  guides(color = FALSE) +
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_y_log10(labels = scales::trans_format("log10", scales::label_math()))

```

#### Final plot
```{r}
#ggsave("figures/compare_spike_ins_atcc/model_readdepth_dedup.png")

ggsave("figures/manuscript_figure_2025/PDF/Figure_1.pdf",
       width = 8,
       height = 5)

ggsave("figures/manuscript_figure_2025/PNG/Figure_1.png",
       width = 8,
       height = 5)

```

### --------------------

## >>>> now redundant? - Figure 1: plots of TE experiment data - read counts and read depths 

### Read-in data
```{r}
#ATCC genomes updated version
#October 2024

#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data

####read counts/normalised read counts####

#metadata

metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

```


### Format data
```{r}
#change labels in facet plots

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")
```

### Make plots
```{r}

#plot viral read counts (non normalised)

read_count <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("log10(Viral Reads)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#ggsave("figures/compare_spike_ins_atcc/read_counts.pdf",width=8,height=6)

#normalise by raw read count

read_count_norm1 <- counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("log10(viral reads/cleaned reads)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#normalise by raw read count and genome length

read_count_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = (Viral.load),
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log (viral reads/cleaned reads/genome length)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

```

### plot just the two lefthand panels from the original figure
```{r, warning=FALSE, echo=FALSE, message=FALSE, fig.height=10, fig.width=8}

ggarrange(read_count,
          read_count_norm2,
          nrow = 2,
          common.legend = TRUE)

#ggsave("figures/compare_spike_ins_atcc/log_read_count_depth_2panels.png",width=10,height=7)


```

### plot viral read counts (non normalised)
```{r}

##deduplicated only

read_count_dedup <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE, method = "lm") +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log (Viral Reads)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#normalise by raw read count and genome length

read_count_norm2_dedup <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE, method = "lm") +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log (viral reads/cleaned reads/genome length)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
```

### ***Final plot***
```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.width=8, fig.height=10}
#plot just the two lefthand panels from the original figure

ggarrange(
  read_count_dedup,
  read_count_norm2_dedup,
  nrow = 2,
  common.legend = TRUE,
  align = "hv"
)

# ggsave("figures/compare_spike_ins_atcc/log_read_count_depth_2panels_dedup.png",
#        width=10,
#        height=7)

ggsave("figures/manuscript_figure_2025/PDF/Figure_S1.pdf",
       width=7,
       height=8)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S1.png",
       width=7,
       height=8)

```


### --------------------


## Figure S1

### Read depths
```{r}
####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

depth_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  depths_bt_all |>
  left_join(metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  )) |>
  rename(Viral.load = `Viral load`)

```

### Make plots
```{r}

read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log (mean read depth)") +
  xlab("Log (Viral load)") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#ggsave("figures/compare_spike_ins_atcc/mean_depth.pdf",width=8,height=6)

```

### ***Final plot***
```{r, warning = FALSE, echo = FALSE, message = FALSE}

ggarrange(
  read_count,
  read_count_norm1,
  read_count_norm2,
  read_depths,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/log_read_count_depth.png",width=10,height=7)

#ggsave("figures/manuscript_figures_pdf/FigureS1.pdf",width=10,height=7)

#apparently normalised read count using method 2 should not actually be the same

ggsave("figures/manuscript_figure_2025/PDF/Figure_S1.pdf",
       width = 12,
       height = 8)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S1.png",
       width = 12,
       height = 8)


```


### --------------------

## Figure S2

### Read-in data / Make plots
```{r}
#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

####compare library capture pool####

cols3 <- c("#228833", "#AA3377")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

metadata3 <- 
  metadata |>
  select(Sample.ID, Pool.for.sequencing) |>
  rename(Sample_id = Sample.ID) |>
  rename(Pool = Pool.for.sequencing)

counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")

```

### Make plots
```{r}

pool_counts_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (Viral Reads)") + xlab("Log (Viral load)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
  scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
  

# pool_readcounts_norm <- 
#   counts_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(norm_counts1))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     # axis.text.x = element_blank(),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("log10(viral reads/cleaned reads)")

pool_norm1_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray") 
  ) +
  ylab("log10(viral reads/cleaned reads)") + xlab("Log (Viral load)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
    scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))

# pool_readcounts_norm2 <- 
#   counts_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(norm_counts2))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     # axis.text.x = element_blank(),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("log10(viral reads/cleaned reads/genome length)")


pool_norm2_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("log10(viral reads/cleaned reads/genome length)") + xlab("Log (Viral load)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
    scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))


#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")

# pool_readdepths <- 
#   depths_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(mean_depth))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     axis.text.x = element_text(angle = 45, hjust = 1),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("Log (mean read depth)")

pool_depths_sum <-
  depths_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(mean_depth),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (mean read depth)") +
  xlab("Log (Viral load)") + 
  scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5}))) +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))


```

### ***Final plot***
```{r}
ggarrange(
  pool_counts_sum,
  pool_norm1_sum,
  pool_norm2_sum,
  pool_depths_sum,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE, 
  align = "hv"
)


#ggsave("figures/compare_spike_ins_atcc/pool_compare_viruses_sum.png",width=10,height=7)

ggsave("figures/manuscript_figure_2025/PDF/Figure_S2.pdf",
       width = 10,
       height = 7)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S2.png",
       width = 10,
       height = 7)

```

### --------------------

## Figure S3
```{r}

##plots of read counts and viral read counts
#November 2024

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import read count files

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

cols2 <- c("#BB5566", "#004488")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

reads_metadata_dedup <-
  left_join(counts_dedup_bt, metadata2, by = "Sample_id")

reads_viral_dedup <-
  left_join(reads_metadata_dedup, viral_reads_dedup, by = "Sample_id")

reads_metadata_nodedup <-
  left_join(counts_nodedup_bt, metadata2, by = "Sample_id")

reads_viral_nodedup <-
  left_join(reads_metadata_nodedup, viral_reads_nodedup, by = "Sample_id")

reads_viral_all <- rbind(reads_viral_dedup, reads_viral_nodedup)


reads_plot <- 
  reads_viral_all |>
  group_by(Background, Sample_id, Viral.load, type) |>
  summarise(
    total_reads = (QC_reads * 2),
    viral_reads = total_virus_reads,
    ATCC_reads = sum(matched),
    prop_ATCC = ATCC_reads / total_reads,
    prop_viral = total_virus_reads / total_reads,
    diff = viral_reads - ATCC_reads
  ) |>
  unique()

```

### Make plots
```{r, echo=FALSE, warning=FALSE, message=FALSE}

total_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = (total_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Viral load") +
  # ggtitle("All") +
  ylab("Total number of reads") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))

viral_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = (viral_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Viral load") +
  # ggtitle("Viral reads") +
  ylab("Number of viral reads") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))

ATCC_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = (ATCC_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Viral load") +
  ggtitle("Spike-in Viral Reads") +
  ylab("Spike-in Viral Reads") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))

prop_ATCC <- reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = Viral.load, y = prop_ATCC, colour = Background)) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
    scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Viral load") +
  ggtitle("Spike-in viral reads") +
  ylab("Proportion") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))

prop_viral <- reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = Viral.load, y = prop_viral, colour = Background)) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
 scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Viral load") +
  # ggtitle("Viral reads") +
  ylab("Proportion of viral reads") +
  scale_color_manual(values = cols2, labels = c("P1", "P2")) + scale_y_continuous(limits=c(0,1.0))


#ggarrange(total_reads,viral_reads,ATCC_reads,prop_viral,prop_ATCC,nrow=2,ncol=3,common.legend = TRUE)

```

### ***Final plot***
```{r, warning=FALSE, echo=FALSE, message=FALSE}

ggarrange(total_reads,
          viral_reads,
          prop_viral,
          nrow = 3,
          common.legend = TRUE,
          align = "hv")

#ggsave("figures/compare_spike_ins_atcc/backgrounds_reads.png")

ggsave("figures/manuscript_figure_2025/PDF/Figure_S3.pdf", 
       width = 8,
       height = 8)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S3.png", 
       width = 8,
       height = 8)

####compare proportion viral reads per pool with shotgun####

# metadata3 <- metadata |>
#   select(Sample.ID,
#          Number.of.read.pairs..quality.adaptor.trimmed.,
#          Pool.for.sequencing) |>
#   rename(Sample_id = Sample.ID) |>
#   rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
# 
# reads_viral_dedup3 <-
#   full_join(metadata3, viral_reads_dedup, by = "Sample_id")
# 
# reads_viral_nodedup3 <-
#   full_join(metadata3, viral_reads_nodedup, by = "Sample_id")
# 
# reads_viral_dedup3 |>
#   group_by(Pool.for.sequencing) |>
#   summarise(
#     viral_reads = sum(total_virus_reads),
#     total_reads = sum(QC_reads * 2),
#     prop_viral = viral_reads / total_reads
#   )
# 
# polyomics <-
#   read.csv(
#     "data_polyomics/total_virus_mapped_reads_per_sample_dedup.csv"
#   )
# 
# #read data from polyomics_indexes_stefano document
# total_reads <- data.frame(total_reads = c(5942760, 7714275))
# 
# keeps <- c("RNA-Msp-p2", "RNA-Msp-p8")
# 
# polyomics_read_prop <- polyomics |>
#   filter(Sample_id %in% keeps)
# 
# polyomics_read_prop2 <- cbind(polyomics_read_prop, total_reads)
# 
# polyomics_read_prop2 |>
#   summarise(prop_viral = total_virus_reads / total_reads)

```

### --------------------

## No longer required? Figure S4

### Make plots
```{r}
####read counts/depths split by background####

#change labels in facet plots

facet_names_bg <- c(
  "dedup_TE" = "Deduplicated",
  "nodedup_TE" = "Non-Deduplicated",
  "p2" = "P1",
  "p8" = "P2"
)

#break down by Background x virus

background_counts_reads <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels =
      c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
      )
  ) +
  scale_x_log10(labels = scales::label_log10()) +
  ylab("log10(Viral Reads)")

backgrounds_counts_reads_norm <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_bw() +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log10()) +
  ylab("log10(viral reads/cleaned reads)")

backgrounds_counts_reads_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(labels = scales::label_log10()) +
  ylab("log10(viral reads/cleaned reads/genome length)")

background_read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
    axis.title.y = element_text(size = 10),
    legend.title = element_blank()
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log10()) +
  ylab("log10(mean read depth)")

```

### ***Final plot*** 
```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.height=6, fig.height=8}

ggarrange(
  background_counts_reads,
  backgrounds_counts_reads_norm,
  backgrounds_counts_reads_norm2,
  background_read_depths,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/backgrounds_read_count_depth.png",width=10,height=7)

#ggsave("figures/manuscript_figures_pdf/FigureS4.pdf",width=10,height=7)

```


### --------------------
### --------------------

## >>> Figure 2: plots of TE experiment data - genome coverage & per site coverage

### Read-in data
```{r}

#plots of TE experiment data - genome coverage & per site coverage
#also separated by viruses
#ATCC genomes updated version
#October 2024

#use bowtie2 data

####genome coverage####

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_TE/")

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv.zip", exdir = "data_TE/")

dedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )


nodedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

file.remove("data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")

file.remove( "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv")

#add length column from read depth file to per site file

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- 
  metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files

#deduplicated and non-deduplicated

counts_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

counts_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

```

### Format data
```{r}
#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- 
  counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

lengths <- 
  counts_reads_norm |>
  select(virus, length) |>
  distinct()

#calculate genome coverage

persite_coverage_dedup <- 
  dedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_nodedup <- 
  nodedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_both <-
  rbind(persite_coverage_dedup, 
        persite_coverage_nodedup)

coverage_labels <- c("0" = "Control",
                     "100" = "10^{2}",
                     "1000" = "10^{3}",
                     "10000" = "10^{5}",
                     "dedup_TE" = "Deduplicated",
                     "nodedup_TE"= "Non-Deduplicated")

coverage_labels2 <- c("0" = "Control",
                      "100" = "10^{2}",
                      "1000" = "10^{3}",
                      "10000" = "10^{5}")
```

### Make plots
```{r, echo = FALSE, warning=FALSE, message=FALSE, fig.height=4, fig.width=6}

cols2 <- c("#BB5566",
           "#004488")

cols4 <- c("#DDAA33",
           "#BB5566",
           "#004488")


persite_coverage_both$Viral.load <- factor(persite_coverage_both$Viral.load, labels = c("0",
    "10^{2}", 
    "10^{3}", 
    "10^{5}"
    ))


##are the deduplicated and non deduplicated datasets identical??
##if they are the same can maybe just show one
#show deduplicated

persite_coverage_both |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(x = virus, y = percent_coverage, colour = Background)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_dodge(width = .75)) +
  facet_grid(~as.character(Viral.load), labeller = label_parsed) +
  theme_few() +
  ylab("Genome coverage") +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(values = cols2, labels = c("P1", "P2")) +
  scale_x_discrete(
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_y_continuous(limits = c(0, 1))

```

### ***Final plot***
```{r, echo = FALSE, warning=FALSE, message=FALSE}
#ggsave("figures/compare_spike_ins_atcc/genome_cov_deduponly.png",width=15,height=6)

ggsave(
  "figures/manuscript_figure_2025/PDF/Figure_2.pdf",
  width = 15,
  height = 6
)

ggsave(
  "figures/manuscript_figure_2025/PNG/Figure_2.png",
  width = 15,
  height = 6
)


```


### --------------------


### --------------------

### Figure S4
```{r}

persite_both <- rbind(dedup_per_site, nodedup_per_site)

persite_reads <- left_join(persite_both, metadata2, by = "Sample_id")

persite_norm <- persite_reads |>
  mutate(norm_cov = coverage / QC_reads)

coverage_labels3 <-c("100" = "1e+02",
                    "1000" = "1e+03",
                    "100000" = "1e+05",
                    "p2" = "P1",
                    "p8"= "P2")

persite_norm$`Viral load` <- 
  factor(persite_norm$`Viral load`, 
         labels = c("0",
                    "10^{2}", 
                    "10^{3}",
                    "10^{5}"))


persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_respiratory_syncytial_virus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Human RSV (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))
```

#### ***Final plot***
```{r}
ggsave("figures/manuscript_figure_2025/PDF/Figure_S4.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S4.png",
       width=15,
       height=10)
```


### --------------------

### Figure S5
```{r, warning=FALSE, message=FALSE, echo=FALSE}

persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Zika_virus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Zika virus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

```

####***Final plot***
```{r}

ggsave("figures/manuscript_figure_2025/PDF/Figure_S5.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S5.png",
       width=15,
       height=10)

```


### --------------------

### Figure S6
```{r, warning=FALSE, message=FALSE, echo=FALSE}

persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_adenovirus_40") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Human Adenovirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

```

####***Final plot***
```{r}

ggsave("figures/manuscript_figure_2025/PDF/Figure_S6.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S6.png",
       width=15,
       height=10)

```


### --------------------

### Figure S7
```{r}
HBV <- 
persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_betaherpesvirus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Human Betaherpesvirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

```

####***Final plot***
```{r}

ggsave(plot = HBV, file="figures/manuscript_figure_2025/PDF/Figure_S7.pdf",
       width=15,
       height=10,
       dpi = 600)

ggsave(plot = HBV, file="figures/manuscript_figure_2025/PNG/Figure_S7.png",
       width=15,
       height=10)


```



### --------------------

### Figure S8
```{r}
#have to do these differently due to segmentation


FLU_ME1 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Influenza B virus (Background 1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))

FLU_ME2 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Influenza B virus (Background 2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))


ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)

```

####***Final plot***
```{r}

ggsave("figures/manuscript_figure_2025/PDF/Figure_S8.pdf",
       width=20,
       height=12,
       dpi=600)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S8.png",
       width=20,
       height=12)
```


### --------------------

### Figure S9
```{r}

REO_ME1 <- 
  persite_norm |>
  filter(virus == "Mammalian_orthoreovirus3") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("log10(Coverage)") +
  ggtitle("Mammalian orthoreovirus 3 (Background 1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4,
                    label = c(
                      expression("10" ^ "2"),
                      expression("10" ^ "3"),
                      expression("10" ^ "5")
                    ))

REO_ME2 <- 
  persite_norm |>
  filter(virus == "Mammalian_orthoreovirus3") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p8") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("log10(Coverage)") +
  ggtitle("Mammalian orthoreovirus 3 (Background 2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) + scale_y_log10() +
  scale_fill_manual(values = cols4,
                    label = c(
                      expression("10" ^ "2"),
                      expression("10" ^ "3"),
                      expression("10" ^ "5")
                    ))

ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)

```

####***Final plot***
```{r}
ggsave("figures/manuscript_figure_2025/PDF/Figure_S9.pdf",
       width=20,
       height=12,
       dpi=600)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S9.png",
       width=20,
       height=12,
       dpi=600)
```




### --------------------

## >>>> Figure 3: all viruses - target and off target

### Read-in data
```{r}

# read in metadata

metadata <-
  read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <-
  metadata |>
  select(
    Sample.ID,
    Background.sample,
    Viral.load,
    Number.of.read.pairs..quality.adaptor.trimmed.
  ) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

#import contingency tables

contingency_dedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_dedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contingency_nodedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_nodedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contaminants <-
  c("Betacoronavirus", "Alphainfluenzavirus", "Gammaretrovirus")

```

### Format data
```{r}

# pivot longer  
dedup_long <-
  contingency_dedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = case_when((genus == "Mastadenovirus" |
                                         genus == "Cytomegalovirus") ~ "DNA",
                                      .default = "RNA"
  ))

no_dedup_long <- contingency_nodedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = case_when((genus == "Mastadenovirus" |
                                         genus == "Cytomegalovirus") ~ "DNA",
                                      .default = "RNA"
  ))

metadata_dedup <- left_join(dedup_long, metadata2, by = "Sample_id")

dedup_viral_reads <-
  left_join(metadata_dedup, viral_reads_dedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

metadata_no_dedup <- left_join(no_dedup_long, metadata2, by = "Sample_id")

nodedup_viral_reads <-
  left_join(metadata_no_dedup, viral_reads_nodedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

```

### Make plots
```{r, warning=FALSE, message=FALSE, echo=FALSE}

facet_names <- c(
  "Msp_p2" = "P1",
  "Msp_p8" = "P2",
  "off_target" = "Background",
  "on_target" = "Spike-in"
)

cols <-
  c(
    "#CCBB44",
    "#332288",
    "#EE7733",
    "#66CCEE",
    "#882255",
    "#4477AA",
    "#AA3377",
    "#228833",
    "#EE6677"
  )


dedup_reads <- 
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(count),
    colour = genus
  )) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10() +
  ylab("Viral Reads") + 
  xlab("Viral load (gc/ml)") + 
  scale_color_manual(values = cols) +
  scale_linetype_manual(values=c("solid", "dotted"))+ 
  theme(
    axis.title.x = element_text(size = 12),
    # axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )

dedup_prop_viral <-
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_viral, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10() +
  scale_y_log10() +
    scale_linetype_manual(values=c("solid", "dotted"))+ 
  ylab("Proportion of viral reads") +
  xlab("Viral load (gc/ml)") + 
  scale_color_manual(values = cols) +
    theme(
    axis.title.x = element_text(size = 12),
    # axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )

dedup_prop_all <-
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_total, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10() +
  scale_y_log10(limits=c(5e-8,1)) +
  ylab("Proportion of total reads") +
  xlab("Viral load (gc/ml)") + 
  scale_linetype_manual(values=c("solid", "dotted"))+ 
  scale_color_manual(values = cols) +
    theme(
    axis.title.x = element_text(size = 12),
    # axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    axis.text.y = element_text(hjust = 1),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )

```

### Dedup -- viral reads, prop all reads, and prop viral reads
```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.width=8, fig.height=12}

ggarrange(
  dedup_reads,
  dedup_prop_all,
  dedup_prop_viral,
  nrow = 3,
  common.legend = TRUE,
  align = "hv"
)

```

### ***Final plot***

```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.width=7, fig.height=8}

#ggsave("figures/compare_spike_ins_atcc/target_offtarget_dedup.png",width=8,height=10)

##just plot viral reads and prop viral

ggarrange(
  dedup_reads,
  dedup_prop_viral,
  nrow = 2,
  common.legend = TRUE,
  align = "hv"
)

ggsave("figures/compare_spike_ins_atcc/target_offtarget_dedup_2025-01-01.png", width = 10, height = 8)

ggsave("figures/manuscript_figures_pdf/Figure_3_2025-01-01.pdf", width = 10, height = 8)

```

### ----nodedup results ----
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=12, fig.width=6}
##extras - no dedup

nodedup_reads <-
  nodedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(count),
    colour = genus
  )) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  ylab("log10(Viral Reads)") +
  scale_color_manual(values = cols)

nodedup_prop_viral <- 
  nodedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_viral, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_y_log10() +
  ylab("Proportion of viral reads") +
  scale_color_manual(values = cols)

nodedup_prop_all <- nodedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_total, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_y_log10() +
  ylab("Proportion of total reads") +
  scale_color_manual(values = cols)

ggarrange(
  nodedup_reads,
  nodedup_prop_all,
  nodedup_prop_viral,
  nrow = 3,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/target_offtarget_nodedup.pdf",width=8,height=10)

```

### --------------------

